Overview

This guide accompanies Davis Vaughan’s rstudio::conf 2018 talk, “The future of time series and financial analysis in the tidyverse.”

Required packages

Unfortunately some of these packages are a bit finicky in what dependencies they require.

ggmap v2.6.1 (CRAN) requires ggplot2 v2.2.1 (CRAN) and will not work with the dev version of ggplot2.

patchwork (Github only) requires and will automatically install the development version of ggplot2.

This causes problems if you want to run this code. I would advise running everything 1 chunk at a time with the CRAN version of ggplot2 until you get down to the performance summary section. At that point, install patchwork with devtools::install_github("thomasp85/patchwork") and run the rest of the performance summary.

# Business Science dev packages
# devtools::install_github("business-science/tibbletime")
# devtools::install_github("business-science/tidyquant2")
library(tibbletime)
library(tidyquant2)

# General packages
library(dplyr)
library(readr)
library(tidyr)

# Used for themes and getting data
library(tidyquant)

# For the mapping example
library(ggmap)
library(gganimate)

# For rolling linear models
library(broom)

Tidyquant example

# AAPL daily returns computed from the adjusted close, on the tidyquant theme
ggplot(
  tq_mutate(tq_get("AAPL"), select = adjusted, mutate_fun = dailyReturn),
  aes(x = date, y = daily.returns)
) +
  geom_line() +
  theme_tq()

Airbnb data

# Load the San Diego Airbnb snapshot, index it on last_modified, and keep
# only the columns used below (time index, price, and coordinates).
airbnb <- select(
  arrange(
    as_tbl_time(
      read_csv("../data/tomslee_airbnb_san_diego_1436_2017-07-11.csv"),
      last_modified
    ),
    last_modified
  ),
  last_modified, price, latitude, longitude
)
## Parsed with column specification:
## cols(
##   room_id = col_integer(),
##   survey_id = col_integer(),
##   host_id = col_integer(),
##   room_type = col_character(),
##   country = col_character(),
##   city = col_character(),
##   borough = col_character(),
##   neighborhood = col_character(),
##   reviews = col_integer(),
##   overall_satisfaction = col_double(),
##   accommodates = col_integer(),
##   bedrooms = col_double(),
##   bathrooms = col_character(),
##   price = col_double(),
##   minstay = col_character(),
##   last_modified = col_datetime(format = ""),
##   latitude = col_double(),
##   longitude = col_double(),
##   location = col_character()
## )
airbnb
## # A time tibble: 9,111 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-11 15:05:35  30.0     32.7      -117
##  2 2017-07-11 15:05:36  25.0     32.8      -117
##  3 2017-07-11 15:05:36  32.0     32.7      -117
##  4 2017-07-11 15:05:36  32.0     32.8      -117
##  5 2017-07-11 15:05:36  35.0     32.7      -117
##  6 2017-07-11 15:05:36  25.0     32.7      -117
##  7 2017-07-11 15:05:36  34.0     32.9      -117
##  8 2017-07-11 15:05:36  33.0     32.7      -117
##  9 2017-07-11 15:05:36  35.0     32.8      -117
## 10 2017-07-11 15:05:36  29.0     32.8      -117
## # ... with 9,101 more rows

Slicing up your time series

The dplyr way:

# Keep rows falling inside the 02:00 UTC hour of 2017-07-12
filter(
  airbnb,
  last_modified >= as.POSIXct("2017-07-12 02:00:00", tz = "UTC") &
    last_modified <= as.POSIXct("2017-07-12 02:59:59", tz = "UTC")
)
## # A time tibble: 67 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-12 02:06:01   500     32.7      -117
##  2 2017-07-12 02:13:36   667     32.8      -117
##  3 2017-07-12 02:14:37   575     33.0      -117
##  4 2017-07-12 02:15:02   678     33.0      -117
##  5 2017-07-12 02:16:05   575     32.8      -117
##  6 2017-07-12 02:18:44   800     32.8      -117
##  7 2017-07-12 02:18:47   724     32.8      -117
##  8 2017-07-12 02:18:47   825     32.7      -117
##  9 2017-07-12 02:18:47   900     32.8      -117
## 10 2017-07-12 02:18:47   989     32.8      -117
## # ... with 57 more rows

tibbletime filter_time()

# Same hour selected with tibbletime's time-formula syntax
filter_time(airbnb, "2017-07-12 02:00:00" ~ "2017-07-12 02:59:59")
## # A time tibble: 67 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-12 02:06:01   500     32.7      -117
##  2 2017-07-12 02:13:36   667     32.8      -117
##  3 2017-07-12 02:14:37   575     33.0      -117
##  4 2017-07-12 02:15:02   678     33.0      -117
##  5 2017-07-12 02:16:05   575     32.8      -117
##  6 2017-07-12 02:18:44   800     32.8      -117
##  7 2017-07-12 02:18:47   724     32.8      -117
##  8 2017-07-12 02:18:47   825     32.7      -117
##  9 2017-07-12 02:18:47   900     32.8      -117
## 10 2017-07-12 02:18:47   989     32.8      -117
## # ... with 57 more rows
# One-sided shorthand: everything within the hour "2017-07-12 02"
filter_time(airbnb, ~"2017-07-12 02")
## # A time tibble: 67 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-12 02:06:01   500     32.7      -117
##  2 2017-07-12 02:13:36   667     32.8      -117
##  3 2017-07-12 02:14:37   575     33.0      -117
##  4 2017-07-12 02:15:02   678     33.0      -117
##  5 2017-07-12 02:16:05   575     32.8      -117
##  6 2017-07-12 02:18:44   800     32.8      -117
##  7 2017-07-12 02:18:47   724     32.8      -117
##  8 2017-07-12 02:18:47   825     32.7      -117
##  9 2017-07-12 02:18:47   900     32.8      -117
## 10 2017-07-12 02:18:47   989     32.8      -117
## # ... with 57 more rows

More examples

# Filter for all days in 2017
filter_time(airbnb, ~"2017")
## # A time tibble: 9,111 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-11 15:05:35  30.0     32.7      -117
##  2 2017-07-11 15:05:36  25.0     32.8      -117
##  3 2017-07-11 15:05:36  32.0     32.7      -117
##  4 2017-07-11 15:05:36  32.0     32.8      -117
##  5 2017-07-11 15:05:36  35.0     32.7      -117
##  6 2017-07-11 15:05:36  25.0     32.7      -117
##  7 2017-07-11 15:05:36  34.0     32.9      -117
##  8 2017-07-11 15:05:36  33.0     32.7      -117
##  9 2017-07-11 15:05:36  35.0     32.8      -117
## 10 2017-07-11 15:05:36  29.0     32.8      -117
## # ... with 9,101 more rows
# All days in July through the end of August
filter_time(airbnb, "2017-07" ~ "2017-08")
## # A time tibble: 9,111 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-11 15:05:35  30.0     32.7      -117
##  2 2017-07-11 15:05:36  25.0     32.8      -117
##  3 2017-07-11 15:05:36  32.0     32.7      -117
##  4 2017-07-11 15:05:36  32.0     32.8      -117
##  5 2017-07-11 15:05:36  35.0     32.7      -117
##  6 2017-07-11 15:05:36  25.0     32.7      -117
##  7 2017-07-11 15:05:36  34.0     32.9      -117
##  8 2017-07-11 15:05:36  33.0     32.7      -117
##  9 2017-07-11 15:05:36  35.0     32.8      -117
## 10 2017-07-11 15:05:36  29.0     32.8      -117
## # ... with 9,101 more rows
# From the start of the series through the end of December
filter_time(airbnb, "start" ~ "2017-12")
## # A time tibble: 9,111 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-11 15:05:35  30.0     32.7      -117
##  2 2017-07-11 15:05:36  25.0     32.8      -117
##  3 2017-07-11 15:05:36  32.0     32.7      -117
##  4 2017-07-11 15:05:36  32.0     32.8      -117
##  5 2017-07-11 15:05:36  35.0     32.7      -117
##  6 2017-07-11 15:05:36  25.0     32.7      -117
##  7 2017-07-11 15:05:36  34.0     32.9      -117
##  8 2017-07-11 15:05:36  33.0     32.7      -117
##  9 2017-07-11 15:05:36  35.0     32.8      -117
## 10 2017-07-11 15:05:36  29.0     32.8      -117
## # ... with 9,101 more rows

A new way to group

# Collapse the index so every row maps to the last observation of its day
airbnb %>% collapse_by(period = "1 day")
## # A time tibble: 9,111 x 4
## # Index: last_modified
##    last_modified       price latitude longitude
##    <dttm>              <dbl>    <dbl>     <dbl>
##  1 2017-07-11 22:58:12  30.0     32.7      -117
##  2 2017-07-11 22:58:12  25.0     32.8      -117
##  3 2017-07-11 22:58:12  32.0     32.7      -117
##  4 2017-07-11 22:58:12  32.0     32.8      -117
##  5 2017-07-11 22:58:12  35.0     32.7      -117
##  6 2017-07-11 22:58:12  25.0     32.7      -117
##  7 2017-07-11 22:58:12  34.0     32.9      -117
##  8 2017-07-11 22:58:12  33.0     32.7      -117
##  9 2017-07-11 22:58:12  35.0     32.8      -117
## 10 2017-07-11 22:58:12  29.0     32.8      -117
## # ... with 9,101 more rows
# The final rows fall into the last daily bucket
tail(collapse_by(airbnb, period = "1 day"))
## # A time tibble: 6 x 4
## # Index: last_modified
##   last_modified       price latitude longitude
##   <dttm>              <dbl>    <dbl>     <dbl>
## 1 2017-07-12 05:20:42  73.0     32.8      -117
## 2 2017-07-12 05:20:42  68.0     32.7      -117
## 3 2017-07-12 05:20:42  90.0     32.8      -117
## 4 2017-07-12 05:20:42  90.0     32.8      -117
## 5 2017-07-12 05:20:42 350       32.7      -117
## 6 2017-07-12 05:20:42 100       32.8      -117

Collapse and summarise

# Collapse by 2 hour periods, summarise median price per period
summarise(
  group_by(collapse_by(airbnb, period = "2 hour"), last_modified),
  median_price = median(price)
)
## # A time tibble: 8 x 2
## # Index: last_modified
##   last_modified       median_price
##   <dttm>                     <dbl>
## 1 2017-07-11 15:59:42         55.0
## 2 2017-07-11 17:59:54        100  
## 3 2017-07-11 19:59:57        199  
## 4 2017-07-11 21:48:16        450  
## 5 2017-07-11 22:58:12        152  
## 6 2017-07-12 00:59:43        285  
## 7 2017-07-12 03:59:26        882  
## 8 2017-07-12 05:20:42         40.0
# Clean and round up: clean = TRUE snaps each bucket to the period boundary
summarise(
  group_by(collapse_by(airbnb, period = "2 hour", clean = TRUE), last_modified),
  median_price = median(price)
)
## # A time tibble: 8 x 2
## # Index: last_modified
##   last_modified       median_price
##   <dttm>                     <dbl>
## 1 2017-07-11 16:00:00         55.0
## 2 2017-07-11 18:00:00        100  
## 3 2017-07-11 20:00:00        199  
## 4 2017-07-11 22:00:00        450  
## 5 2017-07-12 00:00:00        152  
## 6 2017-07-12 02:00:00        285  
## 7 2017-07-12 04:00:00        882  
## 8 2017-07-12 06:00:00         40.0
# Clean and round down
airbnb %>%
  collapse_by(period = "2 hour", clean = TRUE, side = "start") %>%
  group_by(last_modified) %>%
  summarise(median_price = median(price))
## # A time tibble: 8 x 2
## # Index: last_modified
##   last_modified       median_price
##   <dttm>                     <dbl>
## 1 2017-07-11 14:00:00         55.0
## 2 2017-07-11 16:00:00        100  
## 3 2017-07-11 18:00:00        199  
## 4 2017-07-11 20:00:00        450  
## 5 2017-07-11 22:00:00        152  
## 6 2017-07-12 00:00:00        285  
## 7 2017-07-12 02:00:00        882  
## 8 2017-07-12 04:00:00         40.0

The possibilities are endless

This works with ggmap v2.6.1 (CRAN) and ggplot2 v2.2.1 (CRAN). It is a bit finicky with earlier version / dev versions of either package.

# Build an animated map of hourly listing prices.
# NOTE(review): `%>%` binds tighter than `+`, so the piped result of filter()
# lands in qmplot(data = .), and the geom/scale layers are then added to the
# qmplot output as a whole.
airbnb_plot <- airbnb %>%
  
  # Collapse and clean
  collapse_by(period = "hour", clean = TRUE) %>%
  
  # Throw out a few outliers
  filter(
    between(price, quantile(price, .05), quantile(price, .95))
  ) %>%
  
  # Map and animate
  # `frame` is not a ggplot2 aesthetic -- it is consumed by the old
  # gganimate (v0.1) API, hence the "Ignoring unknown aesthetics" warning.
  qmplot(longitude, latitude, data = ., geom = "blank") +
  geom_point(
    aes(color = price, size = price, frame = last_modified), 
    alpha = .5) +
  scale_color_continuous(low = "red", high = "blue")
## Using zoom = 11...
## Map from URL : http://tile.stamen.com/toner-lite/11/356/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/824.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/825.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/826.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/827.png
## Map from URL : http://tile.stamen.com/toner-lite/11/356/828.png
## Map from URL : http://tile.stamen.com/toner-lite/11/357/828.png
## Map from URL : http://tile.stamen.com/toner-lite/11/358/828.png
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
## Warning: Ignoring unknown aesthetics: frame
# Animate the plot over the `frame` aesthetic (old gganimate v0.1 API)
gganimate(airbnb_plot)

Let’s get things rolling - rolling averages

data(FB, package = "tibbletime")

# rollify() turns mean() into rolling versions over 5- and 50-row windows
short_term_mean <- rollify(mean, window = 5)
long_term_mean  <- rollify(mean, window = 50)

# Rolling means of the adjusted close; the first window - 1 values are NA
FB_roll <- mutate(
  FB,
  short_mean = short_term_mean(adjusted),
  long_mean  = long_term_mean(adjusted)
)

# Peek at the index, price, and the two rolling means
select(FB_roll, date, adjusted, short_mean, long_mean)
## # A tibble: 1,008 x 4
##    date       adjusted short_mean long_mean
##    <date>        <dbl>      <dbl>     <dbl>
##  1 2013-01-02     28.0       NA          NA
##  2 2013-01-03     27.8       NA          NA
##  3 2013-01-04     28.8       NA          NA
##  4 2013-01-07     29.4       NA          NA
##  5 2013-01-08     29.1       28.6        NA
##  6 2013-01-09     30.6       29.1        NA
##  7 2013-01-10     31.3       29.8        NA
##  8 2013-01-11     31.7       30.4        NA
##  9 2013-01-14     31.0       30.7        NA
## 10 2013-01-15     30.1       30.9        NA
## # ... with 998 more rows

Moving average plot

# Long format: one row per (date, series) so each series gets its own line
ggplot(
  gather(FB_roll, key = "Indicator", value = "value",
         short_mean, long_mean, adjusted),
  aes(x = date, y = value, color = Indicator)
) +
  geom_line() +
  labs(x = "Date", y = "Price", title = "FB Adjusted stock price with long/short term moving averages") +
  theme_minimal()
## Warning: Removed 53 rows containing missing values (geom_path).

Let’s get things rolling - rolling linear models

# Rolling 5-row linear model; unlist = FALSE keeps each lm fit in a
# list-column instead of trying to simplify to a vector.
# NOTE(review): assumes rollify maps the generated function's arguments
# onto the formula in order (first arg -> .x, second -> .y) -- confirm
# against the tibbletime::rollify docs.
lm_roll <- rollify(
  .f     = ~ lm(.y ~ .x), 
  window = 5, unlist = FALSE)

# With the mapping above, model = lm(adjusted ~ lag_volume): regress the
# adjusted price on the previous day's volume over each 5-row window.
FB_model <- FB %>%
  mutate(
    lag_volume = lag(volume),
    model = lm_roll(lag_volume, adjusted)
  ) 

# `model` is a list-column of lm fits
print(FB_model)
## # A tibble: 1,008 x 10
##    symbol date        open  high   low close    volume adjusted lag_volume
##    <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>      <dbl>
##  1 FB     2013-01-02  27.4  28.2  27.4  28.0  69846400     28.0         NA
##  2 FB     2013-01-03  27.9  28.5  27.6  27.8  63140600     27.8   69846400
##  3 FB     2013-01-04  28.0  28.9  27.8  28.8  72715400     28.8   63140600
##  4 FB     2013-01-07  28.7  29.8  28.6  29.4  83781800     29.4   72715400
##  5 FB     2013-01-08  29.5  29.6  28.9  29.1  45871300     29.1   83781800
##  6 FB     2013-01-09  29.7  30.6  29.5  30.6 104787700     30.6   45871300
##  7 FB     2013-01-10  30.6  31.5  30.3  31.3  95316400     31.3  104787700
##  8 FB     2013-01-11  31.3  32.0  31.1  31.7  89598000     31.7   95316400
##  9 FB     2013-01-14  32.1  32.2  30.6  31.0  98892800     31.0   89598000
## 10 FB     2013-01-15  30.6  31.7  29.9  30.1 173242600     30.1   98892800
## # ... with 998 more rows, and 1 more variable: model <list>

Looking at your model results with broom.

# Pull one row of fit statistics (R^2, p-value, AIC, ...) out of each
# rolling model with broom::glance().
FB_model %>%
  filter(!is.na(model)) %>%                 # drop rows where the window had not filled
  mutate(glanced = map(model, glance)) %>%  # one-row tibble of stats per fit
  select(date, glanced) %>%
  unnest()                                  # expand the list-column into columns
## # A tibble: 1,004 x 12
##    date       r.squared adj.r.squared sigma statistic p.value    df logLik
##    <date>         <dbl>         <dbl> <dbl>     <dbl>   <dbl> <int>  <dbl>
##  1 2013-01-08 0.113           -0.330  0.817  0.255      0.664     2  -3.48
##  2 2013-01-09 0.326            0.102  0.972  1.45       0.314     2  -5.67
##  3 2013-01-10 0.0895          -0.214  1.19   0.295      0.625     2  -6.68
##  4 2013-01-11 0.130           -0.159  1.24   0.450      0.550     2  -6.91
##  5 2013-01-14 0.106           -0.193  1.11   0.354      0.594     2  -6.36
##  6 2013-01-15 0.0861          -0.219  0.691  0.282      0.632     2  -3.97
##  7 2013-01-16 0.426            0.235  0.693  2.23       0.233     2  -3.98
##  8 2013-01-17 0.180           -0.0932 0.808  0.659      0.476     2  -4.75
##  9 2013-01-18 0.0000962       -0.333  0.569  0.000289   0.988     2  -3.00
## 10 2013-01-22 0.0845          -0.221  0.447  0.277      0.635     2  -1.79
## # ... with 994 more rows, and 4 more variables: AIC <dbl>, BIC <dbl>,
## #   deviance <dbl>, df.residual <int>

tidyfinance - FANG

data(FANG, package = "tibbletime")

# Group by ticker, then index on date so time-based verbs work per group
FANG_time <- as_tbl_time(group_by(FANG, symbol), date)

# First two rows of each ticker
FANG_time %>% slice(1:2)
## # A time tibble: 8 x 8
## # Index:  date
## # Groups: symbol [4]
##   symbol date        open  high   low close   volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
## 1 AMZN   2013-01-02 256   258   253   257    3271000    257  
## 2 AMZN   2013-01-03 257   261   256   258    2750900    258  
## 3 FB     2013-01-02  27.4  28.2  27.4  28.0 69846400     28.0
## 4 FB     2013-01-03  27.9  28.5  27.6  27.8 63140600     27.8
## 5 GOOG   2013-01-02 719   727   717   723    5101500    361  
## 6 GOOG   2013-01-03 725   732   721   724    4653700    361  
## 7 NFLX   2013-01-02  95.2  95.8  90.7  92.0 19431300     13.1
## 8 NFLX   2013-01-03  92.0  97.9  91.5  96.6 27912500     13.8
# Daily returns per ticker, plus running drawdown and cumulative return
FANG_return <- mutate(
  calculate_return(
    select(FANG_time, symbol, date, adjusted),
    adjusted,
    period = "daily"
  ),
  drawdown = drawdown(adjusted_return),
  cum_ret  = cumulative_return(adjusted_return)
)

# One row per ticker-day with return, drawdown, and cumulative return
print(FANG_return)
## # A time tibble: 4,032 x 6
## # Index:  date
## # Groups: symbol [4]
##    symbol date       adjusted adjusted_return drawdown  cum_ret
##    <chr>  <date>        <dbl>           <dbl>    <dbl>    <dbl>
##  1 FB     2013-01-02     28.0         0        0        0      
##  2 FB     2013-01-03     27.8        -0.00821 -0.00821 -0.00821
##  3 FB     2013-01-04     28.8         0.0356   0        0.0271 
##  4 FB     2013-01-07     29.4         0.0229   0        0.0507 
##  5 FB     2013-01-08     29.1        -0.0122  -0.0122   0.0379 
##  6 FB     2013-01-09     30.6         0.0526   0        0.0925 
##  7 FB     2013-01-10     31.3         0.0232   0        0.118  
##  8 FB     2013-01-11     31.7         0.0134   0        0.133  
##  9 FB     2013-01-14     31.0        -0.0243  -0.0243   0.105  
## 10 FB     2013-01-15     30.1        -0.0275  -0.0511   0.0750 
## # ... with 4,022 more rows

tidyfinance + tibbletime

# Collapse daily returns into month buckets and total them per ticker
FANG_return_monthly <- summarise(
  group_by(collapse_by(FANG_return, "month"), symbol, date),
  monthly_return = total_return(adjusted_return)
)

# One row per ticker-month
print(FANG_return_monthly)
## # A time tibble: 192 x 3
## # Index:  date
## # Groups: symbol [?]
##    symbol date       monthly_return
##    <chr>  <date>              <dbl>
##  1 AMZN   2013-01-31        0.0318 
##  2 AMZN   2013-02-28       -0.00463
##  3 AMZN   2013-03-28        0.00840
##  4 AMZN   2013-04-30       -0.0476 
##  5 AMZN   2013-05-31        0.0606 
##  6 AMZN   2013-06-28        0.0315 
##  7 AMZN   2013-07-31        0.0847 
##  8 AMZN   2013-08-30       -0.0672 
##  9 AMZN   2013-09-30        0.113  
## 10 AMZN   2013-10-31        0.164  
## # ... with 182 more rows

Performance summary

Cumulative returns

# Top panel: cumulative return lines; x axis hidden (shared with panels below)
plot_cum_ret <- ggplot(FANG_return, aes(x = date, y = cum_ret, color = symbol)) +
  geom_line() +
  theme_tq() +
  theme(
    axis.title.x    = element_blank(),
    axis.text.x     = element_blank(),
    axis.ticks.x    = element_blank(),
    legend.position = "none"
  ) +
  labs(
    y     = "Cumulative Return",
    title = "Performance summary: Facebook, Amazon, Netflix, Google"
  ) +
  scale_color_tq()

Monthly returns

# Middle panel: dodged monthly return bars; x axis hidden
plot_month_ret <- ggplot(FANG_return_monthly,
                         aes(x = date, y = monthly_return, fill = symbol)) +
  geom_col(width = 15, position = position_dodge()) +
  theme_tq() +
  theme(
    axis.title.x    = element_blank(),
    axis.text.x     = element_blank(),
    axis.ticks.x    = element_blank(),
    legend.position = "none"
  ) +
  labs(y = "Monthly Return") +
  scale_fill_tq()

Drawdown

# Bottom panel: overlapping drawdown areas, and the only visible x axis
plot_drawdown <- ggplot(FANG_return, aes(x = date, y = drawdown, fill = symbol)) +
  geom_area(position = position_identity(), alpha = .7) +
  theme_tq() +
  scale_x_date(
    date_breaks = "3 months",
    date_labels = "%b %Y"
  ) +
  labs(x = "", y = "Drawdown") +
  scale_fill_tq()

Performance summary with patchwork

At this point you will need patchwork to run the following code; uncomment the lines below to first install patchwork from GitHub. It should also install the dev version of ggplot2. Then you will likely need to restart R and rerun the code that creates the 3 FANG charts that will be added together (everything after the tidyfinance - FANG chunk). Do not try to run the map code again, as it will not work now that you have the dev version of ggplot2.

# # For performance summary plots
# devtools::install_github("thomasp85/patchwork", force = TRUE)
# library(patchwork)
# 
# plot_cum_ret +
#   plot_month_ret +
#   plot_drawdown +
#   plot_layout(ncol = 1, heights = c(2, 1, 1))